{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "iVeYZpy3fF0N"
      },
      "source": [
        "##### Copyright 2021 Google LLC. Licensed under the Apache License, Version 2.0 (the \"License\");"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "0eXL156ae-iT"
      },
      "outputs": [],
      "source": [
        "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
        "# you may not use this file except in compliance with the License.\n",
        "# You may obtain a copy of the License at\n",
        "#\n",
        "# https://www.apache.org/licenses/LICENSE-2.0\n",
        "#\n",
        "# Unless required by applicable law or agreed to in writing, software\n",
        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
        "# See the License for the specific language governing permissions and\n",
        "# limitations under the License"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "LbGgdE4mj1hd"
      },
      "source": [
        "### Step 1. Prepare a compressed CSV file using [Open Buildings](https://sites.research.google/open-buildings/) data [takes 1-15 minutes depending on the region]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "qP6ADuzRdZTF"
      },
      "outputs": [],
      "source": [
        "#@markdown First, select a region from either the [Natural Earth low res](https://www.naturalearthdata.com/downloads/110m-cultural-vectors/110m-admin-0-countries/) (fastest), [Natural Earth high res](https://www.naturalearthdata.com/downloads/10m-cultural-vectors/10m-admin-0-countries/) or [World Bank high res](https://datacatalog.worldbank.org/dataset/world-bank-official-boundaries) shapefiles:\n",
        "region_border_source = 'Natural Earth (Low Res 110m)'  #@param [\"Natural Earth (Low Res 110m)\", \"Natural Earth (High Res 10m)\", \"World Bank (High Res 10m)\"]\n",
        "region = 'GHA (Ghana)'  #@param [\"\", \"AGO (Angola)\", \"BDI (Burundi)\", \"BEN (Benin)\", \"BFA (Burkina Faso)\", \"BGD (Bangladesh)\", \"BRN (Brunei)\", \"BTN (Bhutan)\", \"BWA (Botswana)\", \"CAF (Central African Republic)\", \"CIV (C\\u00f4te d'Ivoire)\", \"CMR (Cameroon)\", \"COD (Democratic Republic of the Congo)\", \"COG (Republic of the Congo)\", \"COM (Comoros)\", \"CPV (Cape Verde)\", \"DJI (Djibouti)\", \"DZA (Algeria)\", \"EGY (Egypt)\", \"ERI (Eritrea)\", \"ETH (Ethiopia)\", \"GAB (Gabon)\", \"GHA (Ghana)\", \"GIN (Guinea)\", \"GMB (The Gambia)\", \"GNB (Guinea-Bissau)\", \"GNQ (Equatorial Guinea)\", \"IDN (Indonesia)\", \"IOT (British Indian Ocean Territory)\", \"KEN (Kenya)\", \"KHM (Cambodia)\", \"LAO (Laos)\", \"LBR (Liberia)\", \"LKA (Sri Lanka)\", \"LSO (Lesotho)\", \"MDG (Madagascar)\", \"MDV (Maldives)\", \"MOZ (Mozambique)\", \"MRT (Mauritania)\", \"MUS (Mauritius)\", \"MWI (Malawi)\", \"MYS (Malaysia)\", \"MYT (Mayotte)\", \"NAM (Namibia)\", \"NER (Niger)\", \"NGA (Nigeria)\", \"NPL (Nepal)\", \"PHL (Philippines)\", \"REU (R\\u00e9union)\", \"RWA (Rwanda)\", \"SDN (Sudan)\", \"SEN (Senegal)\", \"SGP (Singapore)\", \"SHN (Saint Helena, Ascension and Tristan da Cunha)\", \"SLE (Sierra Leone)\", \"SOM (Somalia)\", \"STP (S\\u00e3o Tom\\u00e9 and Pr\\u00edncipe)\", \"SWZ (Eswatini)\", \"SYC (Seychelles)\", \"TGO (Togo)\", \"THA (Thailand)\", \"TLS (Timor-Leste)\", \"TUN (Tunisia)\", \"TZA (Tanzania)\", \"UGA (Uganda)\", \"VNM (Vietnam)\", \"ZAF (South Africa)\", \"ZMB (Zambia)\", \"ZWE (Zimbabwe)\"]\n",
        "#@markdown **or** specify an area of interest in [WKT format](https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry) (assumes crs='EPSG:4326'); this [tool](https://arthur-e.github.io/Wicket/sandbox-gmaps3.html) might be useful.\n",
        "your_own_wkt_polygon = ''  #@param {type:\"string\"}\n",
        "#@markdown Select type of data to download here:\n",
        "data_type = 'polygons'  #@param [\"polygons\", \"points\"]\n",
        "\n",
        "!sudo apt-get install swig\n",
        "!pip install s2geometry pygeos geopandas\n",
        "\n",
        "import functools\n",
        "import glob\n",
        "import gzip\n",
        "import multiprocessing\n",
        "import os\n",
        "import shutil\n",
        "import tempfile\n",
        "from typing import List, Optional, Tuple\n",
        "\n",
        "import geopandas as gpd\n",
        "from IPython import display\n",
        "import pandas as pd\n",
        "import s2geometry as s2\n",
        "import shapely\n",
        "import tensorflow as tf\n",
        "import tqdm.notebook\n",
        "\n",
        "BUILDING_DOWNLOAD_PATH = ('gs://open-buildings-data/v3/'\n",
        "                          f'{data_type}_s2_level_6_gzip_no_header')\n",
        "\n",
        "def get_filename_and_region_dataframe(\n",
        "    region_border_source: str, region: str,\n",
        "    your_own_wkt_polygon: str) -\u003e Tuple[str, gpd.geodataframe.GeoDataFrame]:\n",
        "  \"\"\"Returns output filename and a geopandas dataframe with one region row.\"\"\"\n",
        "\n",
        "  if your_own_wkt_polygon:\n",
        "    filename = f'open_buildings_v3_{data_type}_your_own_wkt_polygon.csv.gz'\n",
        "    region_df = gpd.GeoDataFrame(\n",
        "        geometry=gpd.GeoSeries.from_wkt([your_own_wkt_polygon]),\n",
        "        crs='EPSG:4326')\n",
        "    if not isinstance(region_df.iloc[0].geometry,\n",
        "                      shapely.geometry.polygon.Polygon) and not isinstance(\n",
        "                          region_df.iloc[0].geometry,\n",
        "                          shapely.geometry.multipolygon.MultiPolygon):\n",
        "      raise ValueError(\"`your_own_wkt_polygon` must be a POLYGON or \"\n",
        "                      \"MULTIPOLYGON.\")\n",
        "    print(f'Preparing your_own_wkt_polygon.')\n",
        "    return filename, region_df\n",
        "\n",
        "  if not region:\n",
        "    raise ValueError('Please select a region or set your_own_wkt_polygon.')\n",
        "\n",
        "  if region_border_source == 'Natural Earth (Low Res 110m)':\n",
        "    url = ('https://www.naturalearthdata.com/http//www.naturalearthdata.com/'\n",
        "           'download/110m/cultural/ne_110m_admin_0_countries.zip')\n",
        "    !wget -N {url}\n",
        "    display.clear_output()\n",
        "    region_shapefile_path = os.path.basename(url)\n",
        "    source_name = 'ne_110m'\n",
        "  elif region_border_source == 'Natural Earth (High Res 10m)':\n",
        "    url = ('https://www.naturalearthdata.com/http//www.naturalearthdata.com/'\n",
        "           'download/10m/cultural/ne_10m_admin_0_countries.zip')\n",
        "    !wget -N {url}\n",
        "    display.clear_output()\n",
        "    region_shapefile_path = os.path.basename(url)\n",
        "    source_name = 'ne_10m'\n",
        "  elif region_border_source == 'World Bank (High Res 10m)':\n",
        "    url = ('https://development-data-hub-s3-public.s3.amazonaws.com/ddhfiles/'\n",
        "           '779551/wb_countries_admin0_10m.zip')\n",
        "    !wget -N {url}\n",
        "    !unzip -o {os.path.basename(url)}\n",
        "    display.clear_output()\n",
        "    region_shapefile_path = 'WB_countries_Admin0_10m'\n",
        "    source_name = 'wb_10m'\n",
        "\n",
        "  region_iso_a3 = region.split(' ')[0]\n",
        "  filename = (f'open_buildings_v3_{data_type}_'\n",
        "              f'{source_name}_{region_iso_a3}.csv.gz')\n",
        "  region_df = gpd.read_file(region_shapefile_path).query(\n",
        "      f'ISO_A3 == \"{region_iso_a3}\"').dissolve(by='ISO_A3')[['geometry']]\n",
        "  print(f'Preparing {region} from {region_border_source}.')\n",
        "  return filename, region_df\n",
        "\n",
        "\n",
        "def get_bounding_box_s2_covering_tokens(\n",
        "    region_geometry: shapely.geometry.base.BaseGeometry) -\u003e List[str]:\n",
        "  region_bounds = region_geometry.bounds\n",
        "  s2_lat_lng_rect = s2.S2LatLngRect_FromPointPair(\n",
        "      s2.S2LatLng_FromDegrees(region_bounds[1], region_bounds[0]),\n",
        "      s2.S2LatLng_FromDegrees(region_bounds[3], region_bounds[2]))\n",
        "  coverer = s2.S2RegionCoverer()\n",
        "  # NOTE: Should be kept in-sync with s2 level in BUILDING_DOWNLOAD_PATH.\n",
        "  coverer.set_fixed_level(6)\n",
        "  coverer.set_max_cells(1000000)\n",
        "  return [cell.ToToken() for cell in coverer.GetCovering(s2_lat_lng_rect)]\n",
        "\n",
        "\n",
        "def s2_token_to_shapely_polygon(\n",
        "    s2_token: str) -\u003e shapely.geometry.polygon.Polygon:\n",
        "  s2_cell = s2.S2Cell(s2.S2CellId_FromToken(s2_token, len(s2_token)))\n",
        "  coords = []\n",
        "  for i in range(4):\n",
        "    s2_lat_lng = s2.S2LatLng(s2_cell.GetVertex(i))\n",
        "    coords.append((s2_lat_lng.lng().degrees(), s2_lat_lng.lat().degrees()))\n",
        "  return shapely.geometry.Polygon(coords)\n",
        "\n",
        "\n",
        "def download_s2_token(\n",
        "    s2_token: str, region_df: gpd.geodataframe.GeoDataFrame) -\u003e Optional[str]:\n",
        "  \"\"\"Downloads the matching CSV file with polygons for the `s2_token`.\n",
        "\n",
        "  NOTE: Only polygons inside the region are kept.\n",
        "  NOTE: Passing output via a temporary file to reduce memory usage.\n",
        "\n",
        "  Args:\n",
        "    s2_token: S2 token for which to download the CSV file with building\n",
        "      polygons. The S2 token should be at the same level as the files in\n",
        "      BUILDING_DOWNLOAD_PATH.\n",
        "    region_df: A geopandas dataframe with only one row that contains the region\n",
        "      for which to keep polygons.\n",
        "\n",
        "  Returns:\n",
        "    Either filepath which contains a gzipped CSV without header for the\n",
        "    `s2_token` subfiltered to only contain building polygons inside the region\n",
        "    or None which means that there were no polygons inside the region for this\n",
        "    `s2_token`.\n",
        "  \"\"\"\n",
        "  s2_cell_geometry = s2_token_to_shapely_polygon(s2_token)\n",
        "  region_geometry = region_df.iloc[0].geometry\n",
        "  prepared_region_geometry = shapely.prepared.prep(region_geometry)\n",
        "  # If the s2 cell doesn't intersect the country geometry at all then we can\n",
        "  # know that all rows would be dropped so instead we can just return early.\n",
        "  if not prepared_region_geometry.intersects(s2_cell_geometry):\n",
        "    return None\n",
        "  try:\n",
        "    # Using tf.io.gfile.GFile gives better performance than passing the GCS path\n",
        "    # directly to pd.read_csv.\n",
        "    with tf.io.gfile.GFile(\n",
        "        os.path.join(BUILDING_DOWNLOAD_PATH, f'{s2_token}_buildings.csv.gz'),\n",
        "        'rb') as gf:\n",
        "      # If the s2 cell is fully covered by country geometry then can skip\n",
        "      # filtering as we need all rows.\n",
        "      if prepared_region_geometry.covers(s2_cell_geometry):\n",
        "        with tempfile.NamedTemporaryFile(mode='w+b', delete=False) as tmp_f:\n",
        "          shutil.copyfileobj(gf, tmp_f)\n",
        "          return tmp_f.name\n",
        "      # Else take the slow path.\n",
        "      # NOTE: We read in chunks to save memory.\n",
        "      csv_chunks = pd.read_csv(\n",
        "          gf, chunksize=2000000, dtype=object, compression='gzip', header=None)\n",
        "      tmp_f = tempfile.NamedTemporaryFile(mode='w+b', delete=False)\n",
        "      tmp_f.close()\n",
        "      for csv_chunk in csv_chunks:\n",
        "        points = gpd.GeoDataFrame(\n",
        "            geometry=gpd.points_from_xy(csv_chunk[1], csv_chunk[0]),\n",
        "            crs='EPSG:4326')\n",
        "        # sjoin 'within' was faster than using shapely's 'within' directly.\n",
        "        points = gpd.sjoin(points, region_df, predicate='within')\n",
        "        csv_chunk = csv_chunk.iloc[points.index]\n",
        "        csv_chunk.to_csv(\n",
        "            tmp_f.name,\n",
        "            mode='ab',\n",
        "            index=False,\n",
        "            header=False,\n",
        "            compression={\n",
        "                'method': 'gzip',\n",
        "                'compresslevel': 1\n",
        "            })\n",
        "      return tmp_f.name\n",
        "  except tf.errors.NotFoundError:\n",
        "    return None\n",
        "\n",
        "# Clear output after pip install.\n",
        "display.clear_output()\n",
        "filename, region_df = get_filename_and_region_dataframe(region_border_source,\n",
        "                                                        region,\n",
        "                                                        your_own_wkt_polygon)\n",
        "# Remove any old outputs to not run out of disk.\n",
        "for f in glob.glob('/tmp/open_buildings_*'):\n",
        "  os.remove(f)\n",
        "# Write header to the compressed CSV file.\n",
        "with gzip.open(f'/tmp/{filename}', 'wt') as merged:\n",
        "  if data_type == \"polygons\":\n",
        "    merged.write(','.join([\n",
        "        'latitude', 'longitude', 'area_in_meters', 'confidence', 'geometry',\n",
        "        'full_plus_code'\n",
        "    ]) + '\\n')\n",
        "  else:\n",
        "    merged.write(','.join([\n",
        "        'latitude', 'longitude', 'area_in_meters', 'confidence', \n",
        "        'full_plus_code'\n",
        "    ]) + '\\n')\n",
        "download_s2_token_fn = functools.partial(download_s2_token, region_df=region_df)\n",
        "s2_tokens = get_bounding_box_s2_covering_tokens(region_df.iloc[0].geometry)\n",
        "# Downloads CSV files for relevant S2 tokens and after filtering appends them\n",
        "# to the compressed output CSV file. Relies on the fact that concatenating\n",
        "# gzipped files produces a valid gzip file.\n",
        "# NOTE: Uses a pool to speed up output preparation.\n",
        "with open(f'/tmp/{filename}', 'ab') as merged:\n",
        "  with multiprocessing.Pool(4) as e:\n",
        "    for fname in tqdm.notebook.tqdm(\n",
        "        e.imap_unordered(download_s2_token_fn, s2_tokens),\n",
        "        total=len(s2_tokens)):\n",
        "      if fname:\n",
        "        with open(fname, 'rb') as tmp_f:\n",
        "          shutil.copyfileobj(tmp_f, merged)\n",
        "        os.unlink(fname)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fwxfj3B1qUWu"
      },
      "source": [
        "### Step 2. Access the generated CSV\n",
        "\n",
        "We currently support two options:\n",
        "\n",
        "*   Upload to your Google Drive (requires authentication) and then download manually from Google Drive [recommended, very fast]\n",
        "*   Download directly from the Colab [not recommended, very slow]\n",
        "\n",
        "When asked to do the authentication please follow the url, paste the generated token into the text field and press enter.\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "6p5iY0WexcJC"
      },
      "outputs": [],
      "source": [
        "#@title Upload result to Google Drive [fast, requires authentication]\n",
        "\n",
        "from google.colab import auth\n",
        "from googleapiclient.http import MediaFileUpload\n",
        "from googleapiclient.discovery import build\n",
        "\n",
        "auth.authenticate_user()\n",
        "\n",
        "drive_service = build('drive', 'v3')\n",
        "file_metadata = {\n",
        "  'name': filename,\n",
        "  'mimeType': 'application/gzip'\n",
        "}\n",
        "media = MediaFileUpload(f'/tmp/{filename}',\n",
        "                        mimetype='application/gzip',\n",
        "                        resumable=True)\n",
        "created = drive_service.files().create(body=file_metadata,\n",
        "                                       media_body=media,\n",
        "                                       fields='id').execute()\n",
        "print('Upload to Google Drive done.')\n",
        "print(f'Please download {file_metadata[\"name\"]} manually from Google Drive.')\n",
        "print(f'Search link: https://drive.google.com/drive/search?q={filename}')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "pm-hsiYIt7x2"
      },
      "outputs": [],
      "source": [
        "#@title Download result directly [slow]\n",
        "\n",
        "from google.colab import files\n",
        "\n",
        "files.download(f'/tmp/{filename}')"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "collapsed_sections": [],
      "last_runtime": {
        "build_target": "//learning/deepmind/public/tools/ml_python:ml_notebook",
        "kind": "private"
      },
      "name": "Open Buildings - download region polygons or points.",
      "private_outputs": true,
      "provenance": [
        {
          "file_id": "1fK7dY741MBONdcfVSLnzEh8vJlBaT89f",
          "timestamp": 1626347718873
        }
      ]
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
